It is important to know whether a patient will be readmitted to the hospital, because the treatment can then be adjusted in order to avoid a readmission.
In this database, there are three different outputs:
1. No readmission;
2. A readmission in less than 30 days;
3. A readmission in more than 30 days.
"The data set represents 10 years (1999-2008) of clinical care at 130 US hospitals and integrated delivery networks. It includes over 50 features representing patient and hospital outcomes. Information was extracted from the database for encounters that satisfied the following criteria.
It is an inpatient encounter (a hospital admission).
It is a diabetic encounter, that is, one during which any kind of diabetes was entered to the system as a diagnosis.
The length of stay was at least 1 day and at most 14 days.
Laboratory tests were performed during the encounter.
Medications were administered during the encounter.
The data contains such attributes as patient number, race, gender, age, admission type, time in hospital, medical specialty of admitting physician, number of lab test performed, HbA1c test result, diagnosis, number of medication, diabetic medications, number of outpatient, inpatient, and emergency visits in the year before the hospitalization, etc."
The data are submitted on behalf of the Center for Clinical and Translational Research, Virginia Commonwealth University, a recipient of NIH CTSA grant UL1 TR00058 and a recipient of the CERNER data. John Clore (jclore '@' vcu.edu), Krzysztof J. Cios (kcios '@' vcu.edu), Jon DeShazo (jpdeshazo '@' vcu.edu), and Beata Strack (strackb '@' vcu.edu). This data is a de-identified abstract of the Health Facts database (Cerner Corporation, Kansas City, MO). Original source of the data set
https://archive.ics.uci.edu/ml/datasets/Diabetes+130-US+hospitals+for+years+1999-2008
The motivation behind this solution is as follows:
1. To create a model to predict readmission, so that patients can receive better treatment.
2. Diabetes is a worldwide problem, and we should try to understand its causes and contributing factors.
3. This is an attempt to implement modelling as well as explanation of the models.
import pandas as pd
import numpy as np
# Warnings
import warnings
warnings.filterwarnings(action='ignore')
warnings.filterwarnings(action='ignore', category=DeprecationWarning)
warnings.filterwarnings(action='ignore', category=FutureWarning)
# modeling
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,accuracy_score,roc_auc_score
# model explainers
import lime
from lime.lime_tabular import LimeTabularExplainer
import eli5
from eli5.sklearn import PermutationImportance
import shap
from shap import TreeExplainer,KernelExplainer,LinearExplainer
# Load the JS visualization machinery SHAP needs for in-notebook force plots.
shap.initjs()
import time
import datetime
import platform
# Wall-clock start time for the whole notebook run.
start = time.time()
# NOTE(review): redundant with the filterwarnings(action='ignore') call above.
warnings.simplefilter('ignore')
def draw_missing_data_table(df):
    """Summarize missing values per column of *df*.

    Returns a DataFrame indexed by column name with two columns:
      Total   - number of NaN entries in the column
      Percent - fraction of rows that are NaN (0.0-1.0)
    sorted by Total in descending order.
    """
    total = df.isnull().sum().sort_values(ascending=False)
    # isnull().count() is simply the row count, so divide by len(df) directly
    # instead of recomputing isnull() and sorting a second series separately
    # (concat aligns on the shared index, so the result is identical).
    percent = total / len(df)
    return pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
# Load the raw UCI "Diabetes 130-US hospitals" extract.
dataset = pd.read_csv('data/diabetic_data.csv')
# Looking at the dataset
dataset.head().T
dataset.describe(include = 'all').T
# Missing values are coded as '?' in this dataset; normalize them to NaN.
dataset.replace('?',np.nan,inplace=True)
draw_missing_data_table(dataset)
# Drop sparsely populated columns (presumably chosen from the missing-data
# table above — verify against its output).
dataset.drop(['weight','medical_specialty','payer_code'],axis=1,inplace=True)
# dropping columns related to IDs
dataset.drop(['encounter_id','patient_nbr','admission_type_id',
'discharge_disposition_id','admission_source_id'],axis=1,inplace=True)
dataset['gender'].unique()
dataset['race'].unique()
# Keep rows with unknown race instead of dropping them.
dataset['race'] = dataset['race'].fillna('Unknown')
dataset['race'].unique()
# Drop any remaining rows that still contain NaN.
dataset.dropna(inplace=True)
# Binarize the target: 1 = NOT readmitted ('NO'), 0 = readmitted (<30 or >30).
dataset['readmitted'] = dataset['readmitted'].apply(lambda x: 1 if x == 'NO' else 0)
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
def feature_encoding(df, label):
    """One-hot encode categorical columns and standardize continuous ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; object-dtype columns are dummy-encoded, every other
        column except *label* is standardized to zero mean / unit variance.
    label : str
        Name of the target column, which is left untouched.

    Returns
    -------
    pandas.DataFrame with encoded/scaled features and the original label.
    """
    # Capture the column list BEFORE encoding. The original code read the
    # global `dataset` here, which silently broke the function for any
    # DataFrame other than that global (bug fix).
    total_columns = list(df.columns.values)
    cat_cols = list(df.select_dtypes('object').columns)
    for col in cat_cols:
        features_encoded = pd.get_dummies(df[col], prefix=col + '_is')
        df = df.join(features_encoded)
        print('Encoded column {}'.format(col))
        df.drop(columns=[col], inplace=True)
    continuous_features = [x for x in total_columns if x not in cat_cols]
    continuous_features.remove(label)
    for col in continuous_features:
        values = df[col].values.astype(float)
        mean = values.mean()
        # Population std (ddof=0), matching sklearn's StandardScaler; guard
        # zero-variance columns with scale 1.0 exactly as StandardScaler does.
        std = values.std()
        scale = std if std != 0 else 1.0
        df[col] = (values - mean) / scale
    return df
# One-hot encode categoricals and standardize continuous features.
feature_dataset = feature_encoding(dataset, 'readmitted')
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
training_dataset = feature_dataset.copy()
y_train=feature_dataset[['readmitted']]
X_train=feature_dataset.drop(['readmitted'],axis=1)
### Apply Feature Selection
# L1 regularization drives irrelevant coefficients to exactly zero;
# SelectFromModel keeps the features with non-zero coefficients.
# NOTE(review): Lasso is a regression model used here on a binary target —
# it works as a selection heuristic, but a LogisticRegression(penalty='l1')
# would be the classification-native choice; confirm intent.
feature_sel_model = SelectFromModel(Lasso(alpha=0.001, random_state=0))
# remember to set the seed, the random state in this function
feature_sel_model.fit(X_train, y_train)
feature_sel_model.get_support()
# Boolean mask -> names of the selected columns.
feature_list = X_train.columns[(feature_sel_model.get_support())]
# let's print some stats
print('total features: {}'.format((X_train.shape[1])))
print('selected features: {}'.format(len(feature_list)))
print(feature_list)
import seaborn as sns
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
color = sns.color_palette()
sns.set_style('darkgrid')
# Correlation heatmap over the selected features plus the target.
corrmat = training_dataset[list(feature_list) + ['readmitted']].corr()
plt.subplots(figsize=(12,9))
sns.heatmap(corrmat, vmax=0.9, square=True)
# Echo the column list used above (notebook-style inspection, no effect).
list(feature_list) + ['readmitted']
# Train/test split (75%/25%)
# Restrict to the Lasso-selected features and split 75%/25% train/test.
X = feature_dataset.drop('readmitted',axis=1)[list(feature_list)]
y = feature_dataset['readmitted']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=0)
# Notebook-style inspection of the split shapes and feature names.
X_train.shape,X_test.shape
list(X_train.columns.values)
%%time
ML_models = {}
model_index = ['LR','RF','DT','NN']
model_sklearn = [LogisticRegression(solver='liblinear',random_state=0),
RandomForestClassifier(n_estimators=100,random_state=0),
DecisionTreeClassifier(),
MLPClassifier([100]*5,early_stopping=True,learning_rate='adaptive',random_state=0)]
model_summary = []
for name,model in zip(model_index,model_sklearn):
ML_models[name] = model.fit(X_train,y_train)
preds = model.predict(X_test)
model_summary.append([name,f1_score(y_test,preds,average='weighted'),accuracy_score(y_test,preds),
roc_auc_score(y_test,model.predict_proba(X_test)[:,1])])
print(ML_models)
model_summary = pd.DataFrame(model_summary,columns=['Name','F1_score','Accuracy','AUC_ROC'])
model_summary = model_summary.reset_index()
model_summary
# Scatter each model's AUC-ROC against its summary-table index and label
# every point with the model's short name just above the marker.
ax = sns.regplot(data=model_summary, x="index", y="AUC_ROC",
                 fit_reg=False, color="red", scatter_kws={'s': 500})
for _, row in model_summary.iterrows():
    ax.text(row['index'], row['AUC_ROC'] + 0.02, row['Name'],
            horizontalalignment='center', verticalalignment='top',
            size='large', color='black')
# Single instance to explain with each model.
# NOTE(review): .loc[200] selects by row LABEL, which assumes the row labelled
# 200 landed in the test split — verify, or use .iloc for positional access.
test_row = pd.DataFrame(X_test.loc[200,:]).T
test_row
#initialization of a explainer from LIME
# class_names follows the classifier's class order (0, 1); given the earlier
# encoding, class 0 = readmitted and class 1 = not readmitted ('NO').
explainer = LimeTabularExplainer(X_train.values,
mode='classification',
feature_names=X_train.columns,
class_names=['Readmitted','Not Readmitted'])
# Local explanation of the logistic-regression prediction for this row.
exp = explainer.explain_instance(test_row.values[0],
ML_models['LR'].predict_proba,
num_features=X_train.shape[1])
exp.show_in_notebook(show_table=True)
# Same instance explained through the random forest.
exp = explainer.explain_instance(test_row.values[0],
ML_models['RF'].predict_proba,
num_features=X_train.shape[1])
exp.show_in_notebook(show_table=True)
# ... through the decision tree ...
exp = explainer.explain_instance(test_row.values[0],
ML_models['DT'].predict_proba,
num_features=X_train.shape[1])
exp.show_in_notebook(show_table=True)
# ... and through the neural network.
exp = explainer.explain_instance(test_row.values[0],
ML_models['NN'].predict_proba,
num_features=X_train.shape[1])
exp.show_in_notebook(show_table=True)
# Global weights of the logistic regression (one coefficient per feature).
eli5.show_weights(ML_models['LR'], feature_names = list(X_test.columns),top=None)
# Per-feature contributions to the LR prediction for the chosen test row.
eli5.show_prediction(ML_models['LR'], test_row.values[0],feature_names=list(X_test.columns),top=None)
# Model-agnostic importance: metric drop when each feature is shuffled.
exp = PermutationImportance(ML_models['LR'],
random_state = 0).fit(X_test, y_test)
eli5.show_weights(exp,feature_names=list(X_test.columns),top=None)
# Same three views for the random forest.
eli5.show_weights(ML_models['RF'],feature_names=list(X_test.columns),top=None)
eli5.show_prediction(ML_models['RF'], test_row.values[0],feature_names=list(X_test.columns),top=None)
exp = PermutationImportance(ML_models['RF'],
random_state = 0).fit(X_test, y_test)
eli5.show_weights(exp,feature_names=list(X_test.columns),top=None)
# Prediction breakdowns for the decision tree and the neural network.
eli5.show_prediction(ML_models['DT'], test_row.values[0],feature_names=list(X_test.columns),top=None)
eli5.show_prediction(ML_models['NN'], test_row.values[0],feature_names=list(X_test.columns),top=None)
# SHAP for the linear model.
# NOTE(review): feature_dependence= was renamed feature_perturbation= in newer
# shap releases — confirm this matches the installed shap version.
explainer = LinearExplainer(ML_models['LR'], X_train, feature_dependence="independent")
shap_values = explainer.shap_values(test_row.values)
# Force plot for the single chosen test row.
shap.force_plot(explainer.expected_value,
shap_values,
test_row.values,
feature_names=X_test.columns)
# Stacked force plot over the first 250 test rows.
shap_values = explainer.shap_values(X_test.head(250).values)
shap.force_plot(explainer.expected_value,
shap_values,
X_test.head(250).values,
feature_names=X_test.columns)
# Global summary (beeswarm) over the full test set.
shap_values = explainer.shap_values(X_test.values)
spplot = shap.summary_plot(shap_values, X_test.values, feature_names=X_test.columns)
# Dependence plots for four features of interest.
top4_cols = ['number_inpatient','number_diagnoses','diabetesMed_is_No','number_emergency']
for col in top4_cols:
    shap.dependence_plot(col, shap_values, X_test)
# TreeExplainer for the random forest. For classifiers its outputs are
# per-class lists, so index [1] selects the class-1 explanation.
explainer = TreeExplainer(ML_models['RF'])
shap_values = explainer.shap_values(test_row.values)
shap.force_plot(explainer.expected_value[1],
shap_values[1],
test_row.values,
feature_names=X_test.columns)
# KernelExplainer is model-agnostic but expensive, so summarize the background
# data with k-means (10 centroids) instead of passing all of X_train.
X_train_kmeans = shap.kmeans(X_train, 10)
explainer = KernelExplainer(ML_models['NN'].predict_proba,X_train_kmeans)
shap_values = explainer.shap_values(test_row.values)
# Force plot for class index 1, matching the TreeExplainer cell above.
shap.force_plot(explainer.expected_value[1],
shap_values[1],
test_row.values,
feature_names=X_test.columns)